package com.ww.crawler.webmagic;

import java.util.ArrayList;
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

/**
 * WebMagic {@link PageProcessor} that walks the Zhihu people-search JSON
 * endpoint for a keyword and then visits each user profile page it finds.
 *
 * <p>Two kinds of pages are handled:
 * <ul>
 *   <li>search result pages (JSON): the profile links contained in the
 *       {@code htmls} fragments and the {@code paging.next} URL are queued;</li>
 *   <li>every other page is treated as a profile page: the user name is read
 *       from the profile header and stored in the {@code username} field.</li>
 * </ul>
 *
 * <p>The processed-user counter and the {@link #paths} list are plain,
 * unsynchronized state; {@link #main(String[])} runs the spider with a single
 * thread.
 */
public class ZhiHuUserPageProcessor implements PageProcessor{
	/**
	 * Matches the people-search JSON endpoint. Any numeric offset is accepted
	 * (the pattern previously required at least two digits, so a single-digit
	 * offset such as {@code offset=0} was mistaken for a profile page).
	 */
	private static final String SEARCH_URL_REGEX =
			"https://www\\.zhihu\\.com/r/search\\?q=[\\s\\S]+&type=people&offset=[0-9]+";
	/** Selects the avatar anchor of one user inside a search-result HTML fragment. */
	private static final String AVATAR_LINK_XPATH =
			"//div[@class='left content']/a[@class='avatar-link left']";
	/** Selects the text of the user name in a profile page header. */
	private static final String PROFILE_NAME_XPATH =
			"//h1[@class='ProfileHeader-title']/span[@class='ProfileHeader-name']/text()";

	// Crawl configuration for the target site: charset, sleep interval, retry count, headers, etc.
	// NOTE(review): the Referer points at jointour.cn although the crawler targets zhihu.com —
	// confirm whether this is intentional.
	private final Site site = Site.me().setRetryTimes(10).setSleepTime(1000)
		    .addHeader("Connection", "keep-alive")
		    .addHeader("Referer", "https://www.jointour.cn/")
		    .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0")
		    .setCharset("UTF-8").enableHttpProxyPool();
	// Number of profile pages processed so far.
	private static int num = 0;
	// Search keyword used to build the start URL.
	private static final String KEYWORD = "java";
	// Every profile link discovered so far, in discovery order. Kept as a record only;
	// it is no longer what gets queued for each page (see processSearchPage).
	List<String> paths = new ArrayList<>();
	
	/**
	 * @return the crawl configuration used for every request of this processor
	 */
	@Override
	public Site getSite() {
		return this.site;
	}

	/**
	 * Dispatches a downloaded page to the search-result or profile handler,
	 * depending on whether its URL matches the search endpoint pattern.
	 *
	 * @param page the downloaded page
	 */
	@Override
	public void process(Page page) {
		System.out.println(">>>>>> |||| " + page.getUrl());
		if (page.getUrl().regex(SEARCH_URL_REGEX).match()) {
			processSearchPage(page);
		} else {
			processProfilePage(page);
		}
	}

	/**
	 * Queues the profile links found on one search-result page plus the next
	 * result page.
	 */
	private void processSearchPage(Page page) {
		String nextPath = page.getJson().jsonPath("$.paging.next").get();
		List<String> fragments = page.getJson().jsonPath("$.htmls[*]").all();

		// Collect this page's links in a local list. Previously the ever-growing
		// instance list was handed to addTargetRequests, so each search page
		// re-submitted every link found on all earlier pages.
		List<String> pagePaths = new ArrayList<>();
		for (String html : fragments) {
			String path = new Html(html).xpath(AVATAR_LINK_XPATH).links().get();
			System.out.println(">>>>>>" + path);
			if (path != null) {
				pagePaths.add(path);
			}
		}
		paths.addAll(pagePaths);
		page.addTargetRequests(pagePaths);

		// The JSON may carry no next-page URL; only follow it when one is present.
		if (nextPath != null && !nextPath.isEmpty()) {
			page.addTargetRequest(nextPath);
		}
	}

	/**
	 * Extracts the user name from a profile page, stores it under the
	 * {@code username} result field and prints the running profile count.
	 */
	private void processProfilePage(Page page) {
		num++;
		String name = page.getHtml().xpath(PROFILE_NAME_XPATH).get();
		page.putField("username", name);
		System.out.println(num);
	}
	
	
	/**
	 * Starts a single-threaded crawl from the first people-search page for the
	 * configured keyword.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Spider.create(new ZhiHuUserPageProcessor()).addUrl("https://www.zhihu.com/r/search?q="+KEYWORD + "&type=people&offset=10")
		.thread(1).run();
	}
}
